Code
# Read the state-level baby-name records from the lab's CSV file.
# NOTE: the variable `names` shares its name with base::names(); it is kept
# because later chunks refer to it.
names <- read.csv(here("labs", "lab_9", "StateNames_A.csv"))

# Interactive preview with DT
DT::datatable(names)

# Rename Gender to Sex and keep only the records for the name "Allison"
allison <- names |>
  rename(Sex = Gender) |>
  filter(Name == "Allison")
# Total "Allison" counts per state and sex, shown as one row per state with a
# column for each sex.
allison |>
  group_by(State, Sex) |>
  # .groups = "drop" returns an ungrouped result directly, so no separate
  # ungroup() step is needed and no grouping message is printed
  summarize(Count = sum(Count), .groups = "drop") |>
  # one column per sex; a state/sex combination with no records becomes 0
  pivot_wider(
    names_from = Sex,
    values_from = Count,
    values_fill = 0
  ) |>
  gt() |>
  cols_label(
    F = "Female",
    M = "Male"
  ) |>
  tab_header(title = "Frequency of Babies Named 'Allison' by U.S. State") |>
  opt_align_table_header(align = "left") |>
  cols_width(everything() ~ px(100))

| Frequency of Babies Named 'Allison' by U.S. State | ||
| State | Female | Male |
|---|---|---|
| AK | 232 | 0 |
| AL | 1535 | 0 |
| AR | 1198 | 0 |
| AZ | 1880 | 0 |
| CA | 12413 | 0 |
| CO | 1594 | 0 |
| CT | 1099 | 0 |
| DC | 321 | 0 |
| DE | 294 | 0 |
| FL | 4455 | 0 |
| GA | 3257 | 0 |
| HI | 183 | 0 |
| IA | 1477 | 0 |
| ID | 451 | 0 |
| IL | 5110 | 0 |
| IN | 3067 | 0 |
| KS | 1283 | 0 |
| KY | 1905 | 20 |
| LA | 1209 | 0 |
| MA | 2218 | 0 |
| MD | 2229 | 0 |
| ME | 340 | 0 |
| MI | 4014 | 0 |
| MN | 2374 | 0 |
| MO | 2882 | 0 |
| MS | 817 | 0 |
| MT | 226 | 0 |
| NC | 3435 | 0 |
| ND | 285 | 0 |
| NE | 807 | 0 |
| NH | 412 | 0 |
| NJ | 3052 | 0 |
| NM | 399 | 0 |
| NV | 729 | 0 |
| NY | 5747 | 0 |
| OH | 5487 | 0 |
| OK | 1421 | 0 |
| OR | 1186 | 0 |
| PA | 4307 | 0 |
| RI | 306 | 0 |
| SC | 1228 | 0 |
| SD | 376 | 0 |
| TN | 2488 | 0 |
| TX | 10192 | 0 |
| UT | 1125 | 0 |
| VA | 3220 | 0 |
| VT | 135 | 0 |
| WA | 1956 | 0 |
| WI | 2367 | 0 |
| WV | 813 | 0 |
| WY | 142 | 0 |
# Keep only the female records, then total the counts within each year
allison_f <- allison |>
  filter(Sex == "F") |>
  group_by(Year) |>
  summarize(Count = sum(Count))
# Yearly totals for female babies named "Allison", drawn as points joined by a
# line, with a fitted linear trend overlaid.
ggplot(
  data = allison_f,
  mapping = aes(x = Year, y = Count)
) +
  geom_point() +
  geom_line() +
  # formula stated explicitly (y ~ x is what method = "lm" uses by default)
  stat_smooth(method = "lm", formula = y ~ x) +
  labs(title = "Number of Babies Named 'Allison' Over Time")

The estimated regression equation is:
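$$\hat{y} = -102x + 209690$$

where $x$ is the year and $\hat{y}$ is the predicted number of female babies named Allison in that year.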
# Simple linear regression of the yearly count on the year
allison_lm <- lm(Count ~ Year, data = allison_f)

# Coefficient table for the fitted model, returned as a tibble
allison_lm |>
  broom::tidy()
# A tibble: 2 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 209690. 42972. 4.88 0.000167
2 Year -102. 21.4 -4.74 0.000223
The residuals do not appear to have equal variance. For fitted values between 6000 and 6500, the residuals are around -500.
# Residuals-versus-fitted scatterplot for the linear model; broom::augment()
# supplies the .fitted and .resid columns
allison_lm |>
  broom::augment() |>
  ggplot(mapping = aes(x = .fitted, y = .resid)) +
  geom_point()

The name Allison appears to be decreasing in popularity. In general, every year there are 102 fewer children named Allison in the US.
# The three spellings of the name that are compared below
allan_variants <- c("Allan", "Alan", "Allen")

# Male records whose name is one of the three spellings
allan_m <- names |>
  rename(Sex = Gender) |>
  filter(Sex == "M") |>
  filter(Name %in% allan_variants)

# Yearly totals across all three spellings, used for the trend plot
allan_plot <- allan_m |>
  group_by(Year) |>
  summarize(Count = sum(Count))
# Yearly totals for male babies with any of the three spellings, drawn as
# points joined by a line, with a fitted linear trend overlaid.
ggplot(
  data = allan_plot,
  mapping = aes(x = Year, y = Count)
) +
  geom_point() +
  geom_line() +
  # formula stated explicitly (y ~ x is what method = "lm" uses by default)
  stat_smooth(method = "lm", formula = y ~ x) +
  labs(title = "Number of Babies Named 'Allan' Over Time")

# Share of each spelling within California and Pennsylvania for the year 2000
allan_m |>
  filter(
    Year == 2000,
    State %in% c("CA", "PA")
  ) |>
  # proportions are computed within each state, so each state's row sums to 1
  group_by(State) |>
  mutate(prop = Count / sum(Count)) |>
  # grouping is only needed for the proportion; drop it before reshaping
  ungroup() |>
  select(-c(Year, Sex, Count)) |>
  pivot_wider(
    names_from = Name,
    values_from = prop,
    values_fill = 0
  ) |>
  gt() |>
  tab_header(
    title = "Percent of Babies Named 'Alan'",
    subtitle = "Comparing California and Pennsylvania"
  ) |>
  # select the spelling columns by name rather than by position, so the
  # formatting still targets the right columns if a spelling is absent
  fmt_percent(columns = any_of(allan_variants), decimals = 2) |>
  opt_align_table_header(align = "left") |>
  cols_width(everything() ~ px(100))

| Percent of Babies Named 'Alan' | |||
| Comparing California and Pennsylvania | |||
| State | Alan | Allen | Allan |
|---|---|---|---|
| CA | 65.35% | 19.86% | 14.79% |
| PA | 42.86% | 47.06% | 10.08% |